# Setup: clear the workspace and attach the packages used below.
# NOTE(review): rm(list=ls()) only clears the global environment and is
# discouraged in scripts (restarting R is the reliable way) -- kept as-is
# to preserve the script's behavior.
rm(list=ls())
library("reshape2")
library("ggplot2")
library("NMF") # for heatmaps
library("ggmap") # for map plot
library("gridExtra")
#library(plyr) # to manipulate tables
Let’s import the newest tHPI data.
# Read the tHPI table: one row per city, with monthly price columns
# (January..October) plus lat/lon coordinates.
# NOTE(review): absolute, user-specific path -- consider a relative path
# for portability.
data.csv <- read.csv("/Users/bian/sandbox/thpi_us.csv")
# Keep a wide ("tabular") copy for the heatmaps and map plots later on.
data.tabular <- data.csv
head(data.tabular)
## city January February March April May June July August September
## 1 New York 243 245 297 377 410 372 317 320 430
## 2 Los Angeles 197 196 203 219 219 227 252 245 220
## 3 Chicago 154 163 213 259 333 311 299 267 293
## 4 Dallas 148 143 154 158 150 148 145 147 157
## 5 Philadelphia 172 189 207 228 246 244 200 198 247
## 6 Houston 149 157 157 161 173 146 149 143 150
## October lat lon
## 1 430 40.71278 -74.00594
## 2 219 34.05223 -118.24368
## 3 307 41.83690 -87.68470
## 4 163 32.77670 -96.79700
## 5 241 39.95000 -75.16670
## 6 154 29.76040 -95.36980
Let’s now reshape our input data to a more friendly format.
# Reshape wide -> long: one row per (city, month) pair.  melt() with
# id="city" already yields the columns city, variable, value in that
# order, so a plain rename is all that is needed.
data <- melt(data.tabular[, 1:11], id = "city")
names(data) <- c("city", "month", "price")
head(data)
## city month price
## 1 New York January 243
## 2 Los Angeles January 197
## 3 Chicago January 154
## 4 Dallas January 148
## 5 Philadelphia January 172
## 6 Houston January 149
We remove the $-sign and turn prices into numeric values.
# Drop every non-digit character (e.g. a leading "$") and coerce the
# remaining digits to numeric.
data$price <- as.numeric(gsub("[^0-9]", "", data$price))
head(data)
## city month price
## 1 New York January 243
## 2 Los Angeles January 197
## 3 Chicago January 154
## 4 Dallas January 148
## 5 Philadelphia January 172
## 6 Houston January 149
summary(data)
## city month price
## Atlanta: 10 January : 25 Min. : 98.0
## Boston : 10 February: 25 1st Qu.:150.0
## Chicago: 10 March : 25 Median :178.5
## Dallas : 10 April : 25 Mean :202.8
## Denver : 10 May : 25 3rd Qu.:234.8
## Detroit: 10 June : 25 Max. :454.0
## (Other):190 (Other) :100
Let’s convert months to Date format
data$date <-as.Date(paste("01",data$month,"15",sep=""),"%d%B%Y")
Let’s illustrate the trends
# Overall price distribution across all cities and months.
ggplot(data, aes(x=price)) + geom_density()
# Price density per month.
ggplot(data, aes(x=price,color=month)) + geom_density()
# Price density per city.
ggplot(data, aes(x=price,color=city)) + geom_density()
# Monthly box plots; x labels rotated for readability.
ggplot(data, aes(x=month,y=price)) + geom_boxplot() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
# Per-city box plots.
ggplot(data, aes(x=city,y=price)) + geom_boxplot() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
# Overall trend over time with a loess smoother.
ggplot(data, aes(x=date,y=price)) + geom_point() + geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Per-city trends over time (confidence bands suppressed).
ggplot(data, aes(x=date,y=price,color=city)) + geom_point() + geom_smooth(se=FALSE)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
For plotting the heatmap, we have to use the original data.tabular, which has a matrix-like format.
# Build a numeric city x month matrix for the base-graphics heatmap.
nba <- data.tabular[, 2:11]  # the ten monthly price columns
# Strip non-digits (e.g. "$") and coerce each column to numeric.
# vapply() replaces sapply() so the matrix result shape is guaranteed.
nba <- vapply(nba, function(x) as.numeric(gsub("[^[:digit:]]", "", x)),
              numeric(nrow(data.tabular)))
row.names(nba) <- data.tabular$city
nba_matrix <- data.matrix(nba)
# Close any open graphics device first.  Guarded: a bare dev.off() errors
# in a fresh session where only the null device exists.
if (length(dev.list()) > 0) dev.off()
## null device
## 1
nba_heatmap <- heatmap(nba_matrix, Rowv=NA, Colv=NA, col = heat.colors(256), scale="row", margins=c(6,10))
Let’s do some heatmaps. First the original one, then a normalized one based on cities and last a normalized one by months.
# NMF::aheatmap of the raw price matrix (no clustering, fixed row/col order).
aheatmap(nba_matrix, Rowv=FALSE, Colv=FALSE, fontsize=5, cexRow=2, cexCol=2)
# normalized based on cities (scale within each column)
aheatmap(nba_matrix, color = "-RdBu:50", scale = "column", Rowv=FALSE, Colv=FALSE,fontsize=5, cexRow=2, cexCol=2)
# normalized based on months (scale within each row)
aheatmap(nba_matrix, color = "-RdBu:50", scale = "row", Rowv=FALSE, Colv=FALSE,fontsize=3, cexRow=3, cexCol=2)
# creating a sample data.frame with your lat/lon points
lon <- data.tabular$lon
lat <- data.tabular$lat
df <- data.frame(lon = lon, lat = lat)  # direct; avoids the matrix round-trip of as.data.frame(cbind(...))
# getting the map, centred on the mean of the city coordinates
# NOTE(review): get_map()/Google static maps now require an API key
# (ggmap::register_google()) -- confirm credentials before running.
mapgilbert <- get_map(location = c(lon = mean(df$lon), lat = mean(df$lat)), zoom = 4,
maptype = "terrain", scale = 1)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=36.756037,-96.415081&zoom=4&size=640x640&scale=1&maptype=terrain&language=en-EN&sensor=false
# plotting the map with some points on it
# strip non-digits and coerce the three latest months to numeric
data.tabular$sept.price <- as.numeric(gsub("[^[:digit:]]",'',data.tabular$September))
data.tabular$oct.price <- as.numeric(gsub("[^[:digit:]]",'',data.tabular$October))
data.tabular$aug.price <- as.numeric(gsub("[^[:digit:]]",'',data.tabular$August))
# October prices as fill colour.  FIX: the stray `label=city` mapping was
# dropped -- `label` is not a geom_point aesthetic and only produced a warning.
ggmap(mapgilbert) + geom_point(data = data.tabular, aes(x = lon, y = lat, fill = oct.price), size = 5, shape = 21)
# Aug/Sep/Oct side by side: three shapes at slightly offset positions.
ggmap(mapgilbert) +
geom_point(data = data.tabular, aes(x = lon, y = lat, fill = sept.price),size = 5, shape = 21) +
geom_point(data = data.tabular, aes(x = lon, y = lat+.7, fill = oct.price),size = 5, shape = 22) +
geom_point(data = data.tabular, aes(x = lon+.7, y = lat, fill = aug.price),size = 5, shape = 24) +
# scale_fill_gradient2(low = "#0000FF", high ="#FF0000", midpoint = mean(data.tabular$oct.price))
scale_fill_gradient2(low = "#0000FF", high ="#FF0000", midpoint = 250)
# same data without the base map, with city name labels
ggplot(data = data.tabular, aes(x = lon, y = lat,label=city)) +
geom_point(aes(fill = oct.price),size = 5, shape = 21) +
geom_text(aes(label=city),hjust=0, vjust=1.5) +
scale_fill_gradient2(low = "#0000FF", high ="#FF0000", midpoint = 250)
We can’t continue with just a bunch of numbers. Let’s extract specific statistics which we can use for further studies. We categorize cities based on their price average and their price trend, and later we will add information about their climate, population, number of hotels, etc.
# getting the price statistics: per-city mean, sd, and (max - min) range
data.summary <- data.frame(city=unique(data$city)) # a data frame for summary of data
# Aggregate price by city with `fun`, then align the result to
# data.summary's city order via match().  This replaces the original
# per-city subset scan (sapply over d[d$city==x,]) with a single O(n) lookup.
city.stat <- function(fun) {
  agg <- aggregate(data$price, by = list(city = data$city), FUN = fun)
  agg$x[match(data.summary$city, agg$city)]
}
data.summary$price.mean <- city.stat(mean)
data.summary$price.sd <- city.stat(sd)
data.summary$price.range <- city.stat(function(p) diff(range(p)))
head(data.summary)
## city price.mean price.sd price.range
## 1 New York 344.1 70.345891 187
## 2 Los Angeles 219.7 18.541545 56
## 3 Chicago 259.9 62.939918 179
## 4 Dallas 151.3 6.429965 20
## 5 Philadelphia 217.2 27.336585 75
## 6 Houston 153.9 8.685237 30
# assigning price classes (equal-width bins via cut())
data.summary$price.mean.class <- cut(data.summary$price.mean,breaks=3)
data.summary$price.sd.class <- cut(data.summary$price.sd,breaks=2)
data.summary$price.range.class <- cut(data.summary$price.range,breaks=2)
head(data.summary)
## city price.mean price.sd price.range price.mean.class
## 1 New York 344.1 70.345891 187 (284,359]
## 2 Los Angeles 219.7 18.541545 56 (210,284]
## 3 Chicago 259.9 62.939918 179 (210,284]
## 4 Dallas 151.3 6.429965 20 (135,210]
## 5 Philadelphia 217.2 27.336585 75 (210,284]
## 6 Houston 153.9 8.685237 30 (135,210]
## price.sd.class price.range.class
## 1 (44.5,82.9] (126,235]
## 2 (6.12,44.5] (17.8,126]
## 3 (44.5,82.9] (126,235]
## 4 (6.12,44.5] (17.8,126]
## 5 (6.12,44.5] (17.8,126]
## 6 (6.12,44.5] (17.8,126]
# let's check the price distributions
ggplot(data=data.summary, aes(x=price.mean.class,y=price.mean)) + geom_boxplot()
ggplot(data=data.summary, aes(x=price.sd.class,y=price.sd)) + geom_boxplot()
ggplot(data=data.summary, aes(x=price.range.class,y=price.range)) + geom_boxplot()
Now, we bring the GPS data to data.summary.
# Attach each city's coordinates from the wide table.  match() returns the
# first matching row, same as the original subset-then-[1] lookup, but
# with a single pass instead of one scan per city.
city_rows <- match(data.summary$city, data.tabular$city)
data.summary$lat <- as.numeric(data.tabular$lat[city_rows])
data.summary$lon <- as.numeric(data.tabular$lon[city_rows])
# Cities on a lon/lat scatter, coloured by mean-price class, shaped by sd class.
ggplot(data=data.summary, aes(x=lon,y=lat,color=price.mean.class)) + geom_point(aes(shape=price.sd.class),size=4) + geom_text(aes(label=city),hjust=0, vjust=0)
From the results of the last plot, we see that the low-price cities don’t have much variance in their prices. Let’s check it here.
data.summary$price.mean
## [1] 344.1 219.7 259.9 151.3 217.2 153.9 294.5 195.3 174.6 358.7 297.8
## [12] 165.3 144.4 185.9 150.5 138.9 135.0 138.3 194.7 214.9 153.3 249.1
## [23] 196.8 137.1 198.1
# Price tiers; the thresholds mirror the cut() bin edges computed above
# (<=210 cheap, (210,284] moderate, >284 expensive).
data.summary$class.price <- ifelse(data.summary$price.mean > 284, "expensive",
                                   ifelse(data.summary$price.mean > 210, "moderate", "cheap"))
# Stability classified on the seasonal price range; an sd-based variant
# from the original is kept below for reference.
#data.summary$class.stability[data.summary$price.sd <= 44.5] <- "stable"
#data.summary$class.stability[data.summary$price.sd > 44.5] <- "unstable"
data.summary$class.stability <- ifelse(data.summary$price.range > 100, "unstable", "stable")
# Five parallel trend plots, one per class label; all share the same y
# range (90..500) so the panels are visually comparable.
# Cheap cities.
ggplot(subset(data,city %in% data.summary$city[data.summary$class.price == "cheap"]), aes(x=date,y=price,color=city)) +
geom_point() + geom_smooth(se=FALSE) +
ggtitle("Cheap Cities") +
scale_y_continuous(limits = c(90,500))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Moderately priced cities.
ggplot(subset(data,city %in% data.summary$city[data.summary$class.price == "moderate"]), aes(x=date,y=price,color=city)) +
geom_point() + geom_smooth(se=FALSE) +
ggtitle("Moderate Cities") +
scale_y_continuous(limits = c(90,500))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Expensive cities.
ggplot(subset(data,city %in% data.summary$city[data.summary$class.price == "expensive"]), aes(x=date,y=price,color=city)) +
geom_point() + geom_smooth(se=FALSE) +
ggtitle("Expensive Cities") +
scale_y_continuous(limits = c(90,500))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Cities with a stable (small-range) price.
ggplot(subset(data,city %in% data.summary$city[data.summary$class.stability == "stable"]), aes(x=date,y=price,color=city)) +
geom_point() + geom_smooth(se=FALSE) +
ggtitle("Stable Cities") +
scale_y_continuous(limits = c(90,500))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Cities with an unstable (large-range) price.
ggplot(subset(data,city %in% data.summary$city[data.summary$class.stability == "unstable"]), aes(x=date,y=price,color=city)) +
geom_point() + geom_smooth(se=FALSE) +
ggtitle("Unstable Cities") +
scale_y_continuous(limits = c(90,500))
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
Using the latter diagrams, we define the following new categories.
# Hand-labelled trend shapes per city, read off the smoothed curves above.
data$trend[data$city %in% c("Phoenix","Miami")] <- "updownup"
# BUG FIX: the original listed "Meanneapolis", which matches no city in the
# data (row 23 is "Minneapolis"), so Minneapolis silently got no label.
data$trend[data$city %in% c("Boston","Minneapolis","Chicago","New York")] <- "upupdownup"
data$trend[data$city %in% c("San Francisco")] <- "upupup"
data$trend[data$city %in% c("Seattle")] <- "upupupdown"
# One panel per trend shape, all on the shared 90..500 price scale.
p1 <- ggplot(subset(data, trend=="updownup"), aes(x=date,y=price,color=city)) +
geom_point() +
geom_smooth(se=FALSE,aes(color=city)) +
ggtitle("three different trends") +
scale_y_continuous(limits = c(90,500))
p2 <- ggplot(subset(data, trend=="upupdownup"), aes(x=date,y=price,color=city)) +
geom_point() +
geom_smooth(se=FALSE,aes(color=city)) +
ggtitle("three different trends") +
scale_y_continuous(limits = c(90,500))
p3 <- ggplot(subset(data, trend=="upupup"), aes(x=date,y=price,color=city)) +
geom_point() +
geom_smooth(se=FALSE,aes(color=city)) +
ggtitle("three different trends") +
scale_y_continuous(limits = c(90,500))
p4 <- ggplot(subset(data, trend=="upupupdown"), aes(x=date,y=price,color=city)) +
geom_point() +
geom_smooth(se=FALSE,aes(color=city)) +
ggtitle("three different trends") +
scale_y_continuous(limits = c(90,500))
grid.arrange(p1,p2,p3,p4)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
It seems that every city is following a semi-sinusoidal trend (probably following the weather conditions). Let’s find the peaks!
# Per city: train on months 1-9, predict through month 10, and plot actual
# values (black) against the prediction (red) to see how well October was
# anticipated.
# NOTE(review): get.predictors()/get.predictions() are not defined in this
# file -- presumably sourced from elsewhere; confirm before running.
p <- lapply(data.summary$city,function(the_city) {
# training set: predictors for months 1..9 plus this city's actual prices
price.train = get.predictors(1:9)
price.train$y = data$price[data$city==the_city][1:9]
# full observed series (months 1..10) for plotting the black line
price.trend = get.predictors(1:10)
price.trend$y = data$price[data$city==the_city][1:10]
# model fitted on months 1..9, extrapolated to month 10
price.prediction <- get.predictions(price.train,10)
ggplot(price.trend, aes(x=t,y=y)) +
geom_point() +
geom_point(data=price.prediction, aes(x=t,y=y)) +
geom_line(data=price.prediction, aes(x=t,y=y),color="red") +
geom_line() +
ggtitle(the_city)
})
For some of the cities we have good predictions for October
do.call(grid.arrange, c(p[c(3,4,19,21,22,17)], list(ncol=3)))
For some other cities, the predictions don’t work that well. This can be because of unknown disturbances that our model doesn’t account for.
# Cities where the prediction misses, possibly due to unmodelled disturbances.
do.call(grid.arrange, c(p[c(1,12,20,23)], list(ncol=2)))
## Calculating the Estimation Error [todo][TODO]
# plotting the predictions
# plotting the predictions: now train on all ten observed months and
# extrapolate two months ahead (through month 12) for every city.
# NOTE(review): get.predictors()/get.predictions() are defined elsewhere.
p <- lapply(data.summary$city,function(the_city) {
# training set uses the full observed series (months 1..10)
price.train = get.predictors(1:10)
price.train$y = data$price[data$city==the_city][1:10]
price.trend = get.predictors(1:10)
price.trend$y = data$price[data$city==the_city][1:10]
# extrapolate through month 12 (November, December)
price.prediction <- get.predictions(price.train,12)
ggplot(price.trend, aes(x=t,y=y)) +
geom_point() +
geom_point(data=price.prediction, aes(x=t,y=y)) +
geom_line(data=price.prediction, aes(x=t,y=y),color="red") +
geom_line() +
ggtitle(the_city)
})
do.call(grid.arrange, c(p[c(1:9)], list(ncol=3)))
do.call(grid.arrange, c(p[c(10:18)], list(ncol=3)))
do.call(grid.arrange, c(p[c(19:25)], list(ncol=3)))
Filling in the prices in the matrix
# Write the November/December predictions into a copy of the raw table so
# they can be verified against real data later.
data.to.be.verified <- data.csv
for (the_city in data.summary$city) {
  price.train <- get.predictors(1:10)
  price.train$y <- data$price[data$city==the_city][1:10]
  price.prediction <- get.predictions(price.train,12)
  # FIX: index by the target frame's own city column.  The original indexed
  # via data.tabular$city, which only works because both frames happen to
  # share data.csv's row order.
  rows <- data.to.be.verified$city == the_city
  data.to.be.verified$November[rows] <- round(price.prediction$y[11])
  data.to.be.verified$December[rows] <- round(price.prediction$y[12])
}
data.to.be.verified[c(1,7:11,14,15)]
## city June July August September October November December
## 1 New York 372 317 320 430 430 418 327
## 2 Los Angeles 227 252 245 220 219 201 195
## 3 Chicago 311 299 267 293 307 279 219
## 4 Dallas 148 145 147 157 163 161 153
## 5 Philadelphia 244 200 198 247 241 238 206
## 6 Houston 146 149 143 150 154 156 153
## 7 Washington D.C. 321 282 238 291 346 321 263
## 8 Miami 153 153 148 147 182 193 219
## 9 Atlanta 172 198 169 185 184 179 172
## 10 Boston 410 389 377 413 454 408 314
## 11 San Francisco 311 324 341 348 350 307 260
## 12 Detroit 168 172 166 166 165 169 167
## 13 Orlando 138 147 130 126 139 130 133
## 14 San Diego 193 237 207 180 180 149 144
## 15 Las Vegas 132 133 137 158 172 185 170
## 16 Phoenix 108 101 98 117 136 132 140
## 17 Indianapolis 134 139 144 131 134 127 117
## 18 Salt Lake City 137 134 152 146 145 141 134
## 19 Denver 209 212 204 209 203 197 186
## 20 New Orleans 199 195 172 177 234 219 209
## 21 St. Louis 159 163 154 165 164 153 137
## 22 Seattle 319 321 328 272 228 185 172
## 23 Minneapolis 237 214 215 207 257 232 198
## 24 San Antonio 134 171 138 127 127 107 108
## 25 San Jose 205 206 211 211 209 192 177
```
Time is tight! If I find more time, I’ll do the following.